Curious what Congress and Trump are saying about the coronavirus?
This analysis looks at the number of tweets, their positive and negative sentiment, and their content for each party, and how these have evolved over time since February 1, 2020.
I utilize open data hosted online. In particular, big thanks to Alex Litel who created the Tweets of Congress repo, where I pulled congressional tweets from, and the folks running Trump Twitter Archive, where I pulled Trump’s tweets from.
The repo for this project is: https://github.com/dcosme/congress-tweets-covid19
load packages
library(tidyverse)
library(jsonlite)
library(tidytext)
library(ggwordcloud)
library(knitr)
library(DT)
library(drlib)

# define palettes
# Discrete color palettes drawn from the Wes Anderson "Zissou1" scale.
palette4 <- wesanderson::wes_palette("Zissou1", 4, "continuous")
# reorder so adjacent parties get contrasting colors
palette4 <- c(palette4[1], palette4[2], palette4[4], palette4[3])
palette5 <- wesanderson::wes_palette("Zissou1", 5, "continuous")
# two-color palette for positive/negative sentiment
palette2 <- c(palette5[2], palette5[4])

# load congress twitter handles
# Read the 116th Congress roster and tidy the Twitter-handle columns into
# one row per member, one column per handle source (ODU.WSDL, CSPAN, ...).
congress_twitter <- read.csv("~/Documents/code/US-Congress/116thCongress/116Congress.csv",
                             stringsAsFactors = FALSE) %>%
  rename("name" = Wikipedia..Names) %>%
  # superseded gather() replaced by pivot_longer()
  pivot_longer(c(ODU.WSDL, CSPAN, TweetCongress, Github),
               names_to = "handle_type", values_to = "twitter_handle") %>%
  select(name, handle_type, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         twitter_handle = na_if(twitter_handle, ""),
         # repair latin1 mojibake placeholders, then strip accents so
         # name-based joins against the other rosters match
         name = gsub("<e9>", "é", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<e1>", "á", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<fa>", "ú", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<ed>", "í", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # first initial + last word of the name: the join key used downstream
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE) %>%
  # superseded spread() replaced by pivot_wider()
  pivot_wider(names_from = handle_type, values_from = twitter_handle)
# Read the senate + house rosters, normalize handles and names the same
# way as congress_twitter above.
# Handles that are blank or point to shared/broken accounts are treated
# as missing.
bad_handles <- c("", "housedemocrats", "senatorloeffler?lang=en")
congress <- read.csv("~/Documents/code/us-senate/us-senate/data/us-senate.csv", stringsAsFactors = FALSE) %>%
  bind_rows(read.csv("~/Documents/code/us-house/us-house/data/us-house.csv", stringsAsFactors = FALSE)) %>%
  select(state_name, title, party, name, gender, ethnicity, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         # %in% replaces the original nested ifelse chain; NA handles stay NA
         twitter_handle = ifelse(twitter_handle %in% bad_handles, NA, twitter_handle),
         # strip accents so name-based joins match
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # first initial + last word of the name: the join key used downstream
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE)
# Merge the two rosters and collapse every handle source into a single
# twitter_handle column; member metadata is filled within each member so
# each handle row carries full info.
congress_info <- full_join(congress, congress_twitter, by = c("first", "last")) %>%
  # superseded gather() replaced by pivot_longer(); the original
  # twitter_handle column is consumed into the new long column
  pivot_longer(c(twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github),
               names_to = "handle_type", values_to = "twitter_handle") %>%
  select(state_name, title, party, first, last, gender, ethnicity, twitter_handle) %>%
  group_by(first, last) %>%
  fill(state_name, title, party, gender, ethnicity, twitter_handle, .direction = "updown") %>%
  distinct() %>%
  filter(!is.na(state_name)) %>%
  ungroup() %>%
  mutate(last = tolower(last))

# load congressional tweets
pull to update repo
## From https://github.com/alexlitel/congresstweets
## * branch master -> FETCH_HEAD
## Already up to date.
define keywords and words to ignore
# regex of COVID-related terms used to keep only relevant tweets
keywords <- "corona|virus|covid|flu|social distancing|pandemic"
# root fragments dropped from word counts (URL/media debris plus the
# covid terms themselves, which would otherwise dominate every cloud)
# NOTE(review): "." is unescaped, so ".com" / "t.co" match slightly more
# than intended — confirm this is acceptable
ignore_root_words <- "http|.com|img|jpg|video|live|index|[0-9]|corona|covid|vid|ncov|aspx|utm|t.co|png"
# exact tokens dropped from word counts (retweet markers, contractions, ...)
ignore_words <- c("rt", "amp", "qt", "pu", "tag", "i'm", "it's", "i’m", "it’s", "lr", "li", "ag")

# load the files
# Read every Feb/Mar 2020 daily tweet dump (newline-delimited JSON) and
# stack them into one data frame.
file_dir <- "~/Documents/code/congresstweets/data"
file_pattern <- "2020-0[2-3]{1}-.*.json"
file_list <- list.files(file_dir, pattern = file_pattern)

# Collect per-file frames in a preallocated list instead of growing a
# data frame with rbind() inside the loop (quadratic copies).
tweet_chunks <- vector("list", length(file_list))
for (i in seq_along(file_list)) {
  # loop variable renamed away from `file`, which shadowed base::file()
  json_file <- file_list[[i]]
  tweet_chunks[[i]] <- tryCatch(
    jsonlite::stream_in(file(file.path(file_dir, json_file)), verbose = FALSE),
    error = function(e) {
      # report the unreadable file and skip it (NULL rows bind to nothing)
      message(json_file)
      NULL
    }
  )
}
tweets_temp <- bind_rows(tweet_chunks)
# Keep only COVID-related congressional tweets; lowercase handle and text
# so joins and keyword matching are case-insensitive.
tweets_all <- tweets_temp %>%
  rename("twitter_handle" = screen_name,
         "day" = time) %>%
  select(twitter_handle, day, text) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         text = tolower(text)) %>%
  filter(grepl(keywords, text))

# find missing congressional twitter handles
# Handles that appear in the tweets but have no match in congress_info.
missing <- tweets_all %>%
  left_join(., congress_info) %>%
  filter(is.na(last)) %>%
  select(twitter_handle) %>%
  unique()

# Unique lowercased last names to search for inside each unknown handle.
last_names <- congress_info %>%
  ungroup() %>%
  select(last) %>%
  unique() %>%
  mutate(last = tolower(last))

# For each unknown handle, collect every last name it contains; a handle
# may match zero or several names, hence the list column + unnest below.
# (The original lambda assigned to a local `last` and relied on the
# implicit return; the expression is now returned directly.)
missing$last <- sapply(missing$twitter_handle, function(handle) {
  last_names$last[sapply(last_names$last, grepl, handle)]
})

# Write the candidate matches out for manual review; the hand-corrected
# version is read back in below as missing_edited.csv.
missing %>%
  unnest(last, keep_empty = TRUE) %>%
  mutate(first = toupper(substring(twitter_handle, 1, 1)),
         last = ifelse(is.na(last), "", last)) %>%
  write.csv(., "missing.csv", row.names = FALSE)
# Hand-corrected handle/party assignments produced from missing.csv.
missing_edited <- read.csv("missing_edited.csv", stringsAsFactors = FALSE)

# Fold the manually-edited handles back into the roster. After the join
# there are two party columns; prefer the edited one (party.y), falling
# back to the roster's (party.x) when the edited value is blank or NA.
congress_full <- congress_info %>%
  full_join(., missing_edited, by = c("first", "last")) %>%
  # superseded gather() replaced by pivot_longer()
  pivot_longer(contains("twitter_handle"),
               names_to = "var", values_to = "twitter_handle") %>%
  select(-var) %>%
  distinct() %>%
  filter(!is.na(twitter_handle)) %>%
  mutate(party = ifelse(party.y == "" | is.na(party.y), party.x, party.y)) %>%
  select(-c(party.x, party.y)) %>%
  distinct() %>%
  filter(!is.na(party))
# Attach member metadata to each tweet and add day/week/month time bins.
congress_tweets <- tweets_all %>%
  left_join(., congress_full) %>%
  mutate(day = lubridate::as_date(day)) %>%
  mutate(week = lubridate::floor_date(day, "week"),
         month = lubridate::floor_date(day, "month"))
# congress_tweets %>%
# filter(is.na(party)) %>%
# select(twitter_handle) %>%
# unique()

load trump tweets
get_tweets <- function(year, fromJSON = TRUE) {
  # Fetch one calendar year of tweets from the Trump Twitter Archive.
  #
  # year: the year to request (interpolated into the archive URL)
  # fromJSON: if TRUE, parse the response body; if FALSE, return the raw
  #   httr response object
  # Returns a parsed data frame (empty if the archive served an HTML page
  #   instead of JSON) or the raw response, per fromJSON.
  url <- paste0(
    "http://trumptwitterarchive.com/",
    "data/realdonaldtrump/",
    year,
    ".json"
  )
  resp <- httr::GET(url)
  # surface (but don't stop on) HTTP error statuses
  httr::warn_for_status(resp)

  if (!fromJSON) {
    return(resp)
  }

  body <- httr::content(resp, "text")
  # an HTML document means the archive returned an error page, not data
  if (grepl("^\\<\\!DOCTYPE", body)) {
    return(data.frame())
  }
  jsonlite::fromJSON(body)
}
# Pull Trump's 2020 tweets, restrict to Feb-Mar and the COVID keywords,
# and shape the columns to match congress_tweets for merging downstream.
# (The original also created first/last/title columns that were
# immediately dropped by select(); they are omitted here.)
trump_tweets <- get_tweets(year = 2020) %>%
  mutate(twitter_handle = "realdonaldtrump",
         # created_at looks like "Wed Mar 04 12:34:56 +0000 2020"
         day = as.POSIXct(created_at, format = "%a %b %d %H:%M:%S %z %Y"),
         day = lubridate::as_date(day),
         week = lubridate::floor_date(day, "week"),
         month = lubridate::floor_date(day, "month"),
         party = "trump") %>%
  select(twitter_handle, text, day, week, month, party) %>%
  # grepl coerces the Date to "YYYY-MM-DD" text, so this keeps Feb/Mar 2020
  filter(grepl("2020-02|2020-03", month)) %>%
  mutate(text = tolower(text)) %>%
  filter(grepl(keywords, text))

# merge tweets
define plotting functions
plot_number <- function(data, timescale = "week", start_date = NULL, palette = palette4) {
  # Plot tweet counts over time, once as stacked bars and once as lines.
  #
  # data: tweets with a `party` column and day/week/month date columns
  # timescale: name of the date column to count by ("day", "week", "month")
  # start_date: optional lower bound applied to `day`
  # palette: one color per party
  # Returns list(plot1 = bar chart, plot2 = line chart).
  if (!is.null(start_date)) {
    data <- filter(data, day >= start_date)
  }

  # both plots share the same filtered data and theme settings
  with_party <- filter(data, !is.na(party))
  shared_labs <- labs(x = "", y = "number of tweets\n")
  shared_theme <- theme(legend.position = "top")

  bars <- ggplot(with_party, aes(!!sym(timescale), fill = party)) +
    geom_bar(stat = "count") +
    scale_fill_manual(name = "", values = palette) +
    shared_labs +
    theme_minimal(base_size = 14) +
    shared_theme

  lines <- ggplot(with_party, aes(!!sym(timescale), color = party)) +
    geom_line(stat = "count") +
    scale_color_manual(name = "", values = palette) +
    shared_labs +
    theme_minimal(base_size = 14) +
    shared_theme

  list(plot1 = bars, plot2 = lines)
}
plot_sentiment <- function(data, n_words = 20,
                           start_date = NULL, duration = NULL,
                           palette = palette2) {
  # Bar chart of the most frequent positive/negative words per party,
  # using the bing sentiment lexicon.
  #
  # data: tweets with `text`, `party`, and `day` columns
  # n_words: words kept per party (ties are included)
  # start_date / duration: optional window filter on `day`; with only
  #   start_date the window is open-ended
  # palette: two colors, one per sentiment

  # optional date filtering
  if (!is.null(start_date)) {
    if (!is.null(duration)) {
      data <- data %>%
        filter(day >= start_date & day <= lubridate::date(start_date) + lubridate::days(duration))
    } else {
      data <- data %>%
        filter(day >= start_date)
    }
  }

  sentiments <- data %>%
    unnest_tokens(word, text) %>%
    # keep only words with a bing sentiment label
    inner_join(tidytext::get_sentiments("bing")) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!grepl(ignore_root_words, word)) %>%
    filter(!word %in% ignore_words) %>%
    # "trump" is a name here, not the (positive) bing sense of the word
    filter(!word == "trump") %>%
    group_by(party) %>%
    count(word, sentiment, sort = TRUE) %>%
    filter(party %in% c("democrat", "republican", "trump")) %>%
    # explicit replacement for the deprecated implicit top_n(n_words),
    # which silently selected by the last column (n); ties kept in both.
    # The original also computed an `order` column (arrange + row_number)
    # that was never used; reorder_within() below handles ordering.
    slice_max(n, n = n_words)

  sentiments %>%
    ggplot(aes(drlib::reorder_within(word, n, party), n, fill = sentiment)) +
    geom_col() +
    drlib::scale_x_reordered() +
    facet_wrap(~party, scales = "free") +
    labs(y = "\nnumber of times tweeted",
         x = NULL) +
    coord_flip() +
    scale_fill_manual(name = "", values = palette) +
    theme_minimal(base_size = 14) +
    theme(legend.position = "top")
}
# Build a word cloud of the most frequently tweeted words, faceted by party.
# Returns list(plot = the ggplot, data = the filtered input data).
# NOTE(review): the `party` argument shadows the `party` column; `!!party`
# unquotes the argument so filter() compares column == argument value —
# confirm this behaves as intended when both are in scope.
# NOTE(review): power_trans() is not namespaced and no package loaded above
# exports it (presumably ggforce or similar attached elsewhere) — verify.
plot_content = function(data, party=NULL, start_date=NULL, duration=NULL, n_words=50, n_colors=6, size=20) {
# normalize to Date so the window comparisons below use like types
data = data %>%
mutate(day = lubridate::as_date(day))
# optional restriction to a single party
if (!is.null(party)) {
data = data %>%
filter(party == !!party)
}
# optional date window: [start_date, start_date + duration], or open-ended
# when only start_date is given
if (!is.null(start_date)) {
if (!is.null(duration)) {
data = data %>%
filter(day >= start_date & day <= lubridate::date(start_date) + lubridate::days(duration))
} else {
data = data %>%
filter(day >= start_date)
}
}
palette = wesanderson::wes_palette("Zissou1", n_colors, "continuous")
# fixed seed so word placement is reproducible across renders
set.seed(42)
plot = data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
select(text, party) %>%
unnest_tokens(word, text) %>%
group_by(party) %>%
count(word, sort = TRUE) %>%
anti_join(stop_words, by = "word") %>%
filter(!grepl(ignore_root_words, word)) %>%
filter(!word %in% ignore_words) %>%
# rows are sorted by n within each party, so this keeps the top n_words
slice(1:n_words) %>%
mutate(word = gsub("[[:punct:]]", "", word),
sum = sum(n),
size = n / sum,
tile = ntile(n, n_colors)) %>%
ggplot(aes(label = word, size = size, color = as.factor(tile))) +
geom_text_wordcloud_area(shape = "square", rm_outside = TRUE) +
scale_size_area(max_size = size, trans = power_trans(1/.7)) +
scale_color_manual(values = palette) +
facet_wrap(~party) +
theme_minimal() +
theme(strip.text.x = element_text(size = 12))
return(list(plot = plot, data = data))
}How many times has Congress and the President tweeted about COVID-19?
How positive and negative is the content of the tweets?
Here is a list of the top 20 positive or negative words for each party and the President.
Here is a list of the top 10 positive or negative words for each party and the President by week.
What are Congress and the President saying about COVID-19?
Here are the 100 most frequently used words by each party and the President.
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 10836 |
| republican | 8598 |
| trump | 110 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 1917 |
| republican | 1112 |
| trump | 24 |
Here are the 50 most frequently used words by each party and the President for each week in February and March.
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 199 |
| republican | 133 |
| trump | 2 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 176 |
| republican | 162 |
| trump | 1 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 103 |
| republican | 80 |
| trump | 1 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 1222 |
| republican | 682 |
| trump | 19 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 2580 |
| republican | 2801 |
| trump | 24 |
p$data %>%
filter(party %in% c("democrat", "republican", "trump")) %>%
group_by(party) %>%
summarize(`number of tweets` = n()) %>%
kable(format = "pandoc")

| party | number of tweets |
|---|---|
| democrat | 5437 |
| republican | 4171 |
| trump | 55 |
Who’s tweeting the most and what are they tweeting?
tweets %>%
filter(!is.na(party)) %>%
group_by(state_name, title, first, last, twitter_handle, party, gender, ethnicity) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
ungroup() %>%
rename("state" = state_name,
"twitter handle" = twitter_handle) %>%
mutate(state = as.factor(state),
title = as.factor(title),
party = as.factor(party),
gender = as.factor(gender),
ethnicity = as.factor(ethnicity)) %>%
DT::datatable(filter = "top", rownames = FALSE)

tweets %>%
filter(!is.na(party)) %>%
select(state_name, title, first, last, twitter_handle, party, gender, ethnicity, day, text) %>%
arrange(twitter_handle) %>%
unnest_tokens(word, text) %>%
group_by(state_name, title, first, last, twitter_handle, party, gender, ethnicity) %>%
count(word, sort = TRUE) %>%
anti_join(stop_words, by = "word") %>%
filter(!grepl(ignore_root_words, word)) %>%
filter(!word %in% ignore_words) %>%
ungroup() %>%
rename("state" = state_name,
"twitter handle" = twitter_handle) %>%
mutate(state = as.factor(state),
title = as.factor(title),
party = as.factor(party),
gender = as.factor(gender),
ethnicity = as.factor(ethnicity)) %>%
DT::datatable(filter = "top", rownames = FALSE)